In [5]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
pd.options.plotting.backend = "plotly"

import numpy as np

from am4chart import *
In [2]:
def get_loc(df,ix,default=0,cols=None):
    """Return ``df.loc[ix]`` (optionally narrowed to ``cols``), or ``default`` if the lookup fails.

    Parameters
    ----------
    df : object exposing a pandas-style ``.loc`` indexer.
    ix : label (or tuple of labels) passed to ``df.loc``.
    default : value returned when the label cannot be found.
    cols : optional key applied to the located result; ignored when falsy.

    Returns
    -------
    The located value, or ``default`` when ``ix`` (or ``cols``) is not present.
    """
    try:
        located = df.loc[ix]
        if cols:
            return located[cols]
        return located
    except (KeyError, TypeError):
        # A missing label raises KeyError; TypeError is kept because the original
        # handler already treated it as "not found".
        return default

def normalizePer(df,col,sum_col,group_col,copy_of = None,new_name=None):
    """Express a column as a percentage of a per-group total, modifying ``df`` in place.

    For every distinct value of ``group_col`` the values of ``copy_of`` are divided by
    ``sum(sum_col) / 100`` computed over that group, and stored in ``new_name``.

    Parameters
    ----------
    df : DataFrame to modify in place.
    col : default for both the source and the destination column.
    sum_col : column whose per-group sum is the denominator.
    group_col : column defining the groups.
    copy_of : source column; defaults to ``col``.
    new_name : destination column; defaults to ``col``.

    Returns
    -------
    None. A group whose total is 0 yields inf/NaN, as with any pandas division by zero.
    """
    if not new_name:
        new_name = col
    if not copy_of:
        copy_of  = col

    # Start from a float copy so that writing percentages never has to upcast an integer column.
    df[new_name] = df[copy_of].astype(float)

    total = df[[group_col,sum_col]].groupby(group_col).sum()

    for xi in total.index:
        tot  = total.loc[xi,sum_col]
        # The mask must come from the frame being normalised, not from a global frame,
        # otherwise calls on any other DataFrame select the wrong rows.
        mask = df[group_col]==xi
        df.loc[mask,new_name] = df.loc[mask,new_name].divide(tot/100)
In [3]:
pd.options.display.max_columns = None
pd.options.display.max_rows    = None
In [4]:
data_folder = "../data/"
In [6]:
am4 = Amchart()
In [7]:
#plotly style:
line_traces = dict(mode='lines+markers',line_shape='spline',line_smoothing=1,marker_size=10,marker_opacity=0.9)

INFO:

Raw information:

  • Total activities: 33,728
  • Total users: 1,052
  • Start year: 2013 (104 users)
    • Following years: 2014: 113, 2015: 105, 2016: 103, 2017: 81, 2018: 144, 2019: 155

All the data in this notebook come from merging two databases in Ticino. The merge is not yet perfect: some records are redundant (to be fixed in the future).

months, months_supervisors, users, activities

other notebook

Data Loading and Cleaning

In [8]:
df_users = pd.read_csv(f"{data_folder}users.csv")
users_per_year = df_users.groupby("start_year").size().reset_index(name='count')
In [9]:
month_map={1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
month_order=['Aug','Sep','Oct','Nov','Dec','Jan','Feb','Mar','Apr','May','Jun','Jul']
In [10]:
# Load the monthly aggregates, replacing missing values with 0.
df_months = pd.read_csv(f"{data_folder}months.csv").fillna(0)

# Replace month numbers with their abbreviations, then rank each month by its
# position in `month_order` (1 = first entry) so rows can be sorted in that order.
df_months = df_months.replace({'month': month_map})
month_rank = {name: rank for rank, name in enumerate(month_order, start=1)}
df_months['month_order'] = df_months['month'].map(month_rank)
df_months = df_months.sort_values(by=['month_order','activity_school_year'])

df_months.head()
Out[10]:
month activity_school_year n_users_per_year n_logins n_activities n_recipes n_experiences avg_n_user_activities avg_n_user_recipes avg_n_user_experiences n_files n_files_recipes n_files_experiences avg_n_files avg_n_files_recipes avg_n_files_experiences std_n_files std_n_files_recipes std_n_files_experiences n_feedback_requests n_feedback_responses n_feedback_requests_recipes n_feedback_responses_recipes n_feedback_requests_experiences n_feedback_responses_experiences n_in_curriculum n_in_curriculum_recipes n_in_curriculum_experiences n_in_curriculum_insert_date n_in_curriculum_insert_date_recipes n_in_curriculum_insert_date_experiences avg_activity_total_length std_activity_total_length avg_len_descriptions std_len_descriptions avg_len_steps std_len_steps avg_len_observations std_len_observations avg_sum_len_reflections std_avg_sum_len_reflections avg_avg_len_reflections std_avg_len_reflections avg_len_bilancio std_len_bilancio avg_len_competenze std_len_competenze avg_len_miglioramenti std_len_miglioramenti avg_len_critici std_len_critici total_reflections total_null_reflections n_edits perc_total_feedback_requests perc_total_feedback_requests_recipes perc_total_feedback_requests_experiences perc_feedback_responses perc_feedback_responses_recipes perc_feedback_responses_experiences perc_in_curriculum perc_recipes_in_curriculum perc_experiences_in_curriculum perc_in_curriculum_insert_date perc_recipes_in_curriculum_insert_date perc_experiences_in_curriculum_insert_date month_order
21 Aug 1 547 175 71.0 29.0 42.0 3.55 1.45 2.10 60.0 43.0 17.0 6.0 5.0 1.0 3.3534 4.1610 1.4200 0.0 0.0 0.0 0.0 0.0 0.0 65.0 28.0 37.0 37.0 24.0 13.0 103.0 76.14 5.0 5.50 92.0 73.33 10.0 23.66 1.31 1.79 0.63 0.60 0.75 0.43 0.75 0.43 1.00 0.94 2.75 7.56 9.0 7.0 208.0 0.00 0.00 0.00 0.00 0.00 0.00 91.55 39.44 52.11 52.11 33.80 18.31 1
22 Aug 2 371 1420 532.0 273.0 259.0 6.05 3.10 2.94 1555.0 1300.0 255.0 10.0 9.0 2.0 4.8276 5.8386 4.0261 89.0 72.0 54.0 48.0 35.0 24.0 478.0 232.0 246.0 26.0 12.0 14.0 128.0 86.92 7.0 12.96 113.0 77.48 5.0 12.03 7.26 9.32 4.48 7.21 7.15 11.64 9.72 13.62 7.02 15.62 5.14 11.72 173.0 51.0 3206.0 16.73 19.78 13.51 13.53 17.58 9.27 89.85 43.61 46.24 4.89 2.26 2.63 1
23 Aug 3 224 910 585.0 152.0 433.0 9.75 2.53 7.22 654.0 490.0 164.0 11.0 10.0 1.0 6.0734 7.4977 2.8684 51.0 29.0 17.0 14.0 34.0 15.0 516.0 90.0 426.0 0.0 0.0 0.0 169.0 125.11 8.0 15.89 153.0 113.59 6.0 12.37 6.23 5.72 4.70 5.29 5.12 11.23 10.37 15.02 4.63 6.36 4.79 8.43 158.0 12.0 5107.0 8.72 11.18 7.85 4.96 9.21 3.46 88.21 15.38 72.82 0.00 0.00 0.00 1
24 Sep 1 547 521 173.0 96.0 77.0 2.62 1.45 1.17 355.0 248.0 107.0 8.0 6.0 2.0 5.2481 6.2521 2.8655 28.0 18.0 24.0 14.0 4.0 4.0 123.0 52.0 71.0 19.0 15.0 4.0 120.0 104.80 7.0 11.09 106.0 94.23 11.0 21.22 8.72 10.38 4.62 8.28 7.57 11.91 13.61 24.10 8.36 12.45 5.33 9.47 43.0 11.0 674.0 16.18 25.00 5.19 10.40 14.58 5.19 71.10 30.06 41.04 10.98 8.67 2.31 2
25 Sep 2 371 1666 614.0 303.0 311.0 3.81 1.88 1.93 1807.0 1299.0 508.0 10.0 8.0 2.0 5.7121 6.9026 4.6594 106.0 73.0 63.0 45.0 43.0 28.0 567.0 266.0 301.0 56.0 19.0 37.0 146.0 128.25 13.0 30.95 123.0 102.28 9.0 14.74 7.08 7.56 4.88 6.78 6.28 12.30 10.13 12.50 6.05 9.75 5.88 11.39 221.0 91.0 4782.0 17.26 20.79 13.83 11.89 14.85 9.00 92.35 43.32 49.02 9.12 3.09 6.03 2
In [11]:
# Divide each monthly total by `n_users_per_year` to obtain per-user figures.
y_users = df_months['n_users_per_year']
for norm_col, total_col in {
    'norm_avg_n_user_recipes': 'n_recipes',
    'norm_avg_n_user_experiences': 'n_experiences',
    'norm_avg_n_activities': 'n_activities',
}.items():
    df_months[norm_col] = df_months[total_col].divide(y_users)

Data Exploring

Apprentices

# of activities

In [11]:
fig = df_months.plot(x="month", y=["n_recipes","n_experiences","n_feedback_responses","n_edits"],facet_col='activity_school_year')
fig.update_layout(
    title="Activities, feedback requests, feedback responses",
    title_x=0.5,
    yaxis_title="count",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
#fig.update_yaxes(type="log")
fig.update_traces(line_traces)

fig.show()
In [12]:
fig = df_months.plot(x="month", y=["n_activities"],facet_col='activity_school_year')
fig.update_layout(
    title="Total number of activities per month",
    title_x=0.5,
    yaxis_title="# activities",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()
In [13]:
fig = df_months.plot(x="month", y=["norm_avg_n_activities"],facet_col='activity_school_year')
fig.update_layout(
    title="Normalized number of activities per users per month",
    title_x=0.5,
    yaxis_title="average activities",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()
In [14]:
fig = df_months.plot.bar(x="month", y=["norm_avg_n_user_recipes","norm_avg_n_user_experiences"],facet_col='activity_school_year')
fig.update_layout(
    title="Average number of activities per user per month",
    title_x=0.5,
    yaxis_title="average activities",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)

fig.show()

Files

In [15]:
# normalize wrt number of activities per month
df_months['norm_n_files_recipes'] = df_months['n_files_recipes'].divide(df_months['n_activities'])
df_months['norm_n_files_experiences'] = df_months['n_files_experiences'].divide(df_months['n_activities'])


fig = df_months.plot.bar(x="month", y=['norm_n_files_recipes','norm_n_files_experiences'],facet_col='activity_school_year')
fig.update_layout(
    title="Number of activities's files per per activity per month",
    title_x=0.5,
    yaxis_title="# of files per activity",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)

fig.show()

And the average grouped by years:

In [16]:
df_months[['activity_school_year','n_files','avg_n_files','std_n_files']]\
    .groupby('activity_school_year').mean()
Out[16]:
n_files avg_n_files std_n_files
activity_school_year
1 2738.500000 9.666667 6.282992
2 1906.083333 10.583333 7.111450
3 853.666667 9.833333 5.787942

Feedbacks and Curriculum

In [17]:
fig = df_months.plot.bar(x="month", y=['perc_total_feedback_requests_recipes','perc_total_feedback_requests_experiences']
                         ,facet_col='activity_school_year')
fig.update_layout(
    title="% of activities with requests for feedback per month",
    title_x=0.5,
    yaxis_title="% activities with request",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)

fig.show()
In [18]:
normalizePer(df_months,'perc_total_feedback_requests_recipes',
             'n_feedback_requests','activity_school_year','n_feedback_requests_recipes')
normalizePer(df_months,'perc_total_feedback_requests_experiences',
             'n_feedback_requests','activity_school_year','n_feedback_requests_experiences')
In [19]:
fig = df_months.plot.bar(x="month", y=['perc_total_feedback_requests_recipes','perc_total_feedback_requests_experiences'], 
                         facet_col='activity_school_year')
fig.update_layout(
    title="% of feedback requests over the school year",
    title_x=0.5,
    yaxis_title="% activities with request",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)

fig.show()

TODO: try with request date

In [20]:
fig = df_months.plot.bar(x="month", y=['perc_in_curriculum'],
                         facet_col='activity_school_year')
fig.update_layout(
    title="% of activities in curriculum per month",
    title_x=0.5,
    yaxis_title="% activities in curriculum",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)

fig.show()

TODO: double-check the n_activities query because of the "final" flag. TODO: consider whether to multiply the two.

In [21]:
normalizePer(df_months,'norm_perc_recipes_in_curriculum',
             'n_activities','activity_school_year','n_in_curriculum_recipes')
normalizePer(df_months,'norm_perc_experiences_in_curriculum',
             'n_activities','activity_school_year','n_in_curriculum_experiences')
In [22]:
fig = df_months.plot.bar(x="month", y=['norm_perc_recipes_in_curriculum','norm_perc_experiences_in_curriculum'], 
                         facet_col='activity_school_year')
fig.update_layout(
    title="Normalized % of activities in curriculum per month",
    title_x=0.5,
    yaxis_title="% activities in curriculum",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)

fig.show()

Lengths

(total length: description + steps + observations)

In [23]:
fig = df_months.plot.bar(x="month", y=['avg_activity_total_length'], 
                         facet_col='activity_school_year')
fig.update_layout(
    title="Activity total length per month",
    title_x=0.5,
    yaxis_title="Total lenght average",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)

fig.show()
In [24]:
fig = df_months.plot.bar(x="month", y=['avg_len_descriptions','avg_len_steps','avg_len_observations'],
                         facet_col='activity_school_year')
fig.update_layout(
    title="Activity total length per month",
    title_x=0.5,
    yaxis_title="Total lenght average",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)

fig.show()
In [25]:
df_months[['activity_school_year','avg_activity_total_length','std_activity_total_length']]\
    .groupby('activity_school_year').mean()
Out[25]:
avg_activity_total_length std_activity_total_length
activity_school_year
1 134.666667 120.981667
2 152.500000 151.664167
3 153.166667 128.285000

Note: std very high because of NULL descriptions. TODO: try without NULL descriptions

In [26]:
fig = df_months.plot.bar(x="month", y=['avg_sum_len_reflections'], 
                         facet_col='activity_school_year')
fig.update_layout(
    title="Average reflections total length per month",
    title_x=0.5,
    yaxis_title="Total average lenght average",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)

fig.show()
In [27]:
fig = df_months.plot.bar(x="month", y=['avg_len_bilancio','avg_len_competenze','avg_len_miglioramenti','avg_len_critici'],
                         facet_col='activity_school_year')
fig.update_layout(
    title="Total length of average reflections per month",
    title_x=0.5,
    yaxis_title="Total lenght of average",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)

fig.show()

Edits

In [28]:
# Monthly number of edits, one facet per school year.
# Title and y-axis label describe edits, since the plotted column is `n_edits`.
fig = df_months.plot(x="month", y=["n_edits"],facet_col='activity_school_year')
fig.update_layout(
    title="Total number of edits per month",
    title_x=0.5,
    yaxis_title="# edits",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()

Logins

In [29]:
# Labels for the values of pandas' `Series.dt.dayofweek`, which numbers the week from
# Monday (0) to Sunday (6). The login histograms group on `dt.dayofweek`, so the map must
# follow that convention. The CSV's own `dayofweek` column uses a different numbering
# (2013-11-25, a Monday, is stored as 2), so this map must not be applied to it.
dayofweek_map = {0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'}

df_students_logins = pd.read_csv(f"{data_folder}students_logins.csv")\
                        .drop(['ut_user_type','start_semester','start_year'],axis=1)
# pd.to_datetime instead of astype('datetime64'): the unit-less dtype string is rejected by pandas 2.x.
df_students_logins['date'] = pd.to_datetime(df_students_logins['date'])
df_students_logins.head()
Out[29]:
us_user date user_school_year month dayofweek hour minute
0 20 2013-11-25 15:30:13 1 11 2 15 30
1 20 2013-12-19 14:11:06 1 12 5 14 11
2 20 2014-01-08 18:50:53 1 1 4 18 50
3 20 2014-01-08 20:14:51 1 1 4 20 14
4 20 2014-01-14 10:56:35 1 1 3 10 56
In [30]:
# Count logins per (day of week, hour of day), using pandas' numbering for the day of week.
student_login_counts = df_students_logins.groupby(
    [df_students_logins["date"].dt.dayofweek.rename("dayofweek"),
     df_students_logins["date"].dt.hour.rename("hour")]
).size()

# Expand to the complete 7 x 24 grid so that slots without any login appear with a count of 0.
# Reindexing does not depend on an exception being swallowed for missing (day, hour) pairs.
full_week_grid = pd.MultiIndex.from_product([range(7), range(24)], names=["dayofweek", "hour"])
date_hist_students_logins = (
    student_login_counts.reindex(full_week_grid, fill_value=0).rename("count").reset_index()
)

# Replace the numeric day of week with its label.
date_hist_students_logins = date_hist_students_logins.replace({'dayofweek':dayofweek_map})
In [31]:
fig = date_hist_students_logins.plot(x="hour", y=["count"],facet_col='dayofweek',color='dayofweek')
fig.update_layout(
    title="Apprentices logins",
    title_x=0.5,
    yaxis_title="# logins",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1]))

fig.show()
In [32]:
fig = date_hist_students_logins.plot(x="hour", y=["count"],line_group='dayofweek', color='dayofweek')
fig.update_layout(
    title="Apprentices logins",
    title_x=0.5,
    yaxis_title="# logins",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()
In [33]:
fig = date_hist_students_logins.groupby("hour").sum().reset_index().plot(x="hour", y=["count"])
fig.update_layout(
    title="Cumulate apprentices logins",
    title_x=0.5,
    yaxis_title="# logins",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()

Supervisors

In [34]:
# Load the supervisors' monthly aggregates, replacing missing values with 0.
df_months_supervisors = pd.read_csv(f"{data_folder}months_supervisors.csv").fillna(0).sort_values(by=['month','activity_school_year'])

df_months_supervisors = df_months_supervisors.replace({'month':month_map})
# Rank months using this frame's own `month` column. Taking it from `df_months` only gives
# the right result when both frames happen to share identical index labels and months.
df_months_supervisors['month_order'] = df_months_supervisors['month'].map(dict(zip(month_order,range(1,13))))
df_months_supervisors = df_months_supervisors.sort_values(by=['month_order','activity_school_year'])

df_months_supervisors.head()
Out[34]:
month activity_school_year n_users_per_year n_logins n_activities n_recipes n_experiences avg_n_user_activities avg_n_user_recipes avg_n_user_experiences n_files n_files_recipes n_files_experiences avg_n_files avg_n_files_recipes avg_n_files_experiences std_n_files std_n_files_recipes std_n_files_experiences n_feedback_requests n_feedback_responses n_feedback_requests_recipes n_feedback_responses_recipes n_feedback_requests_experiences n_feedback_responses_experiences n_in_curriculum n_in_curriculum_recipes n_in_curriculum_experiences n_in_curriculum_insert_date n_in_curriculum_insert_date_recipes n_in_curriculum_insert_date_experiences avg_activity_total_length std_activity_total_length avg_len_descriptions std_len_descriptions avg_len_steps std_len_steps avg_len_observations std_len_observations avg_sum_len_reflections std_avg_sum_len_reflections avg_avg_len_reflections std_avg_len_reflections avg_len_bilancio std_len_bilancio avg_len_competenze std_len_competenze avg_len_miglioramenti std_len_miglioramenti avg_len_critici std_len_critici total_reflections total_null_reflections n_edits perc_total_feedback_requests perc_total_feedback_requests_recipes perc_total_feedback_requests_experiences perc_feedback_responses perc_feedback_responses_recipes perc_feedback_responses_experiences perc_in_curriculum perc_recipes_in_curriculum perc_experiences_in_curriculum perc_in_curriculum_insert_date perc_recipes_in_curriculum_insert_date perc_experiences_in_curriculum_insert_date month_order
21 Aug 1 227 71 71.0 29.0 42.0 3.55 1.45 2.10 60.0 43.0 17.0 6.0 5.0 1.0 3.3534 4.1610 1.4200 0.0 0.0 0.0 0.0 0.0 0.0 65.0 28.0 37.0 37.0 24.0 13.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 208.0 0.00 0.00 0.00 0.00 0.00 0.00 91.55 39.44 52.11 52.11 33.80 18.31 1
22 Aug 2 101 21 532.0 273.0 259.0 6.05 3.10 2.94 1555.0 1300.0 255.0 10.0 9.0 2.0 4.8276 5.8386 4.0261 89.0 72.0 54.0 48.0 35.0 24.0 478.0 232.0 246.0 26.0 12.0 14.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.72 7.88 3.41 6.63 5.57 9.85 6.13 10.28 6.68 18.96 4.53 9.85 39.0 31.0 3206.0 16.73 19.78 13.51 13.53 17.58 9.27 89.85 43.61 46.24 4.89 2.26 2.63 1
23 Aug 3 40 10 585.0 152.0 433.0 9.75 2.53 7.22 654.0 490.0 164.0 11.0 10.0 1.0 6.0734 7.4977 2.8684 51.0 29.0 17.0 14.0 34.0 15.0 516.0 90.0 426.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.71 5.61 4.33 4.73 3.91 6.62 8.76 12.78 4.93 8.99 5.24 10.44 19.0 10.0 5107.0 8.72 11.18 7.85 4.96 9.21 3.46 88.21 15.38 72.82 0.00 0.00 0.00 1
24 Sep 1 227 64 173.0 96.0 77.0 2.62 1.45 1.17 355.0 248.0 107.0 8.0 6.0 2.0 5.2481 6.2521 2.8655 28.0 18.0 24.0 14.0 4.0 4.0 123.0 52.0 71.0 19.0 15.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.95 6.67 5.03 5.18 5.97 8.85 12.39 17.23 7.78 10.72 5.67 8.23 16.0 3.0 674.0 16.18 25.00 5.19 10.40 14.58 5.19 71.10 30.06 41.04 10.98 8.67 2.31 2
25 Sep 2 101 106 614.0 303.0 311.0 3.81 1.88 1.93 1807.0 1299.0 508.0 10.0 8.0 2.0 5.7121 6.9026 4.6594 106.0 73.0 63.0 45.0 43.0 28.0 567.0 266.0 301.0 56.0 19.0 37.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5.43 6.56 3.59 6.19 4.93 12.00 6.39 8.80 4.52 7.33 5.89 12.48 25.0 34.0 4782.0 17.26 20.79 13.83 11.89 14.85 9.00 92.35 43.32 49.02 9.12 3.09 6.03 2

Feedbacks

In [35]:
normalizePer(df_months_supervisors,'perc_total_feedback_responses_recipes',
             'n_feedback_responses','activity_school_year','n_feedback_responses_recipes')
normalizePer(df_months_supervisors,'perc_total_feedback_responses_experiences',
             'n_feedback_responses','activity_school_year','n_feedback_responses_experiences')
In [36]:
fig = df_months_supervisors.plot.bar(x="month", y=['perc_total_feedback_responses_recipes','perc_total_feedback_responses_experiences'],
                         facet_col='activity_school_year')
fig.update_layout(
    title="% of feedback responses over the school year",
    title_x=0.5,
    yaxis_title="% of responses",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)

fig.show()
In [37]:
# Ratio of supervisors' feedback responses to feedback requests, per row.
# The numerator comes from df_months_supervisors and the denominator from df_months, so pandas
# pairs the rows by index label — NOTE(review): confirm both frames share the same index.
# fillna(0) only covers NaN results (e.g. 0/0); a non-zero count divided by 0 stays inf.
df_months_supervisors['ration_response'] = (df_months_supervisors['n_feedback_responses']/df_months['n_feedback_requests']).fillna(0)
In [38]:
fig = df_months_supervisors.plot.bar(x="month", y=['ration_response'],
                         facet_col='activity_school_year')
fig.update_layout(
    title="Ratio responses/requests",
    title_x=0.5,
    yaxis_title="responses/requests",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)

fig.show()
In [39]:
fig = df_months_supervisors.plot.bar(x="month", y=['avg_len_bilancio','avg_len_competenze','avg_len_miglioramenti','avg_len_critici'], 
                         facet_col='activity_school_year')
fig.update_layout(
    title="Total length of average feedbacks per month",
    title_x=0.5,
    yaxis_title="Total length",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)

fig.show()
In [ ]:
 
In [40]:
# Load the per-supervisor feedback counts, replacing missing values with 0.
df_supervisors_feedback = pd.read_csv(f"{data_folder}supervisors_feedbacks.csv").fillna(0)
# sent / received, with every value above 1 capped at 1.0.
df_supervisors_feedback['ratio'] = (
    df_supervisors_feedback['sent']
    .div(df_supervisors_feedback['received'])
    .clip(upper=1.0)
)
In [41]:
fig = df_supervisors_feedback.plot.hist(x="ratio",nbins=40)
fig.update_layout(
    title="Histogram: ratio responses/requests per supervisor",
    title_x=0.5,
    yaxis_title="Count",
)

fig.show()
In [42]:
received_trigger_min = 0
received_trigger_max = 150

df_trigger=df_supervisors_feedback[(df_supervisors_feedback['received']>=received_trigger_min) & (df_supervisors_feedback['received']<=received_trigger_max)]\
    .sort_values(by="ratio")

fig = df_trigger.plot.bar(x="us_user",y="ratio",color="received")
fig.update_layout(
    title="Ratio responses/requests per each supervisor", 
    title_x=0.5,
    xaxis_title='supervisor',
    xaxis_type='category'
)

fig.update_xaxes(showticklabels=False)
fig.show()
In [43]:
received_trigger_min = 5
received_trigger_max = 150

df_trigger=df_supervisors_feedback[(df_supervisors_feedback['received']>=received_trigger_min) & (df_supervisors_feedback['received']<=received_trigger_max)]\
    .sort_values(by="ratio")

fig = df_trigger\
            .plot.scatter(x="us_user",y="ratio",color="received")
fig.update_layout(
    title="Ratio responses/requests per each supervisor", 
    title_x=0.5,
    xaxis_title='supervisor',
    xaxis_type='category'
)
fig.update_traces(mode='markers',opacity=0.8,)

fig.add_trace(df_trigger.plot.bar(x="us_user",y="ratio",color="received").data[0])

fig.update_xaxes(showticklabels=False)
fig.show()
In [44]:
bests=df_supervisors_feedback.sort_values(by=["ratio","received"],ascending=False).head(10)
In [45]:
bests.set_index("us_user").join(df_users[['us_user','user_email','user_name']].set_index("us_user"))\
    [['user_name','ratio','received']]
Out[45]:
user_name ratio received
us_user
663 Carlo Giovio 1.0 301
500485 Mirjam Trinkler 1.0 129
500994 René Studer 1.0 79
500093 Tosi Loris 1.0 60
5 NaN 1.0 46
424 Roberto Danesi 1.0 43
787 Marco Viviani 1.0 36
330 Giovanni Guidicelli 1.0 34
500194 Francesco Perrone 1.0 23
500902 Philip Ries 1.0 22
In [ ]:
 
In [13]:
df_feedbacks_info = pd.read_csv(f"{data_folder}activities_feedbacks_info.csv")
df_feedbacks_info.head()
Out[13]:
ac_activity sender recipient request_date response_date delay_hours edits_between edits_after times_before_answer student_grade supervisor_grade activity_school_year start_year
0 438 20 86 2014-02-04 13:57:46 NaN NaN 0 0 1 4.0 NaN 1 2013
1 579 20 86 2014-02-04 15:20:18 NaN NaN 0 0 1 4.5 NaN 1 2013
2 723 65 87 2014-02-20 14:42:12 2014-11-17 20:59:09 6486.0 1 0 1 5.0 4.0 1 2013
3 844 20 86 2014-02-25 13:32:38 NaN NaN 0 0 2 4.0 NaN 1 2013
4 845 20 86 2014-02-25 14:15:32 NaN NaN 0 0 1 4.5 NaN 1 2013
In [14]:
# Count the feedback rows without a student grade and report their share of all rows.
no_answer = df_feedbacks_info['student_grade'].isnull().sum()
# The ".2%" format spec avoids float artefacts that round(x, 4) * 100 can print (e.g. 7.000000000000001).
print(f'There are {no_answer}/{len(df_feedbacks_info)} without student grade ({no_answer/len(df_feedbacks_info):.2%})')
There are 363/4868 without student grade (7.46%)
In [15]:
# Count the feedback rows without a response date and report their share of all rows.
no_answer = df_feedbacks_info['response_date'].isnull().sum()
# The ".2%" format spec avoids float artefacts that round(x, 4) * 100 can print.
print(f'There are {no_answer}/{len(df_feedbacks_info)} without answer ({no_answer/len(df_feedbacks_info):.2%})')
There are 2891/4868 without answer (59.39%)
In [16]:
# Count the feedback rows without a supervisor grade and report their share of all rows.
no_answer = df_feedbacks_info['supervisor_grade'].isnull().sum()
# The ".2%" format spec avoids float artefacts that round(x, 4) * 100 can print.
print(f'There are {no_answer}/{len(df_feedbacks_info)} without supervisor grade ({no_answer/len(df_feedbacks_info):.2%})')
There are 2961/4868 without supervisor grade (60.83%)
In [17]:
response_no_grade = (df_feedbacks_info['response_date'].notnull() & df_feedbacks_info['supervisor_grade'].isnull()).sum()
grade_no_response = (df_feedbacks_info['response_date'].isnull() & df_feedbacks_info['supervisor_grade'].notnull()).sum()

print(f'There are {response_no_grade} responses without grades')
There are 155 responses without grades
In [18]:
# Number of rows that carry a supervisor grade but have no response date.
grade_no_response #TODO: verify this!
Out[18]:
85
In [19]:
# Drop, in place, every row that has a missing value in any column. From here on
# df_feedbacks_info only holds rows where all fields are present, and every later cell
# that reads it (including re-runs of the missing-value counts) sees this reduced frame.
df_feedbacks_info.dropna(inplace=True)
# Express the delay in days instead of hours.
df_feedbacks_info['delay_days'] = df_feedbacks_info['delay_hours']/24
In [20]:
df_hist = df_feedbacks_info.groupby(["activity_school_year","times_before_answer"]).count()[['ac_activity']].reset_index().rename(columns={'ac_activity': 'count'})


fig = df_hist.plot.hist(x="times_before_answer", y='count', 
                         color = "activity_school_year", histnorm='percent' )

fig.update_layout(
    barmode='group',
    xaxis_type='category',
    title="Requests before a feedback",
    title_x=0.5,
    yaxis_title="Percent",
    xaxis_title="Number of requests before the response",
    coloraxis_showscale=False,
)

fig.show()
In [21]:
fig = df_feedbacks_info[df_feedbacks_info['delay_days']<100].plot.hist(x="delay_days",  nbins=30,
                         color = "activity_school_year" )

fig.update_layout(
    barmode='group',
    title="Histogram: days before an answer",
    title_x=0.5,
    yaxis_title="Count",
    xaxis_title="days before an answer",
    coloraxis_showscale=False,
)

fig.show()
In [22]:
fig = df_feedbacks_info[df_feedbacks_info['delay_days']<100].plot.hist(x="delay_days",  nbins=30,
                         color = "activity_school_year", histnorm='percent' )

fig.update_layout(
    barmode='group',
    title="Histogram: normalized number of days before an answer",
    title_x=0.5,
    yaxis_title="Percent",
    xaxis_title="days before an answer",
    coloraxis_showscale=False,
)

fig.show()
In [ ]:
 
In [23]:
fig = df_feedbacks_info[df_feedbacks_info['edits_after']<5].plot.hist(x="edits_after",  nbins=5,
                         color = "activity_school_year", histnorm='percent' )

fig.update_layout(
    barmode='group',
    title="Histogram: normalized number of edits after a feedback",
    title_x=0.5,
    yaxis_title="Percent",
    xaxis_title="# of edits after an answer per feedback request",
)

fig.show()
In [101]:
df_feedbacks_info['has_edit_after'] = (df_feedbacks_info['edits_after']>0).astype(int)
In [141]:
group_keys = ["activity_school_year","supervisor_grade","has_edit_after"]

# Observed number of feedbacks for every (school year, supervisor grade, edited afterwards) combination.
df_count = df_feedbacks_info.groupby(group_keys).size().reset_index(name='count')

# Expand to the full cartesian grid of the observed values so that combinations that never
# occur get a count of 0. MultiIndex.from_product is public API, unlike
# pd.core.reshape.util.cartesian_product.
full_grid = pd.MultiIndex.from_product([df_count[key].unique() for key in group_keys], names=group_keys)
c = (
    df_count.set_index(group_keys)
            .reindex(full_grid, fill_value=0)
            .reset_index()
            .sort_values(by=['activity_school_year','supervisor_grade'])
)

# Put the "edited" and "not edited" counts side by side. They are matched explicitly on
# (school year, grade) instead of relying on two filtered lists having the same order.
pair_keys = ['activity_school_year','supervisor_grade']
edited = c[c['has_edit_after']==1].drop("has_edit_after",axis=1).rename(columns={'count':'edit'})
not_edited = c[c['has_edit_after']==0].drop("has_edit_after",axis=1).rename(columns={'count':'no_edit'})
df_feedback_edits = edited.merge(not_edited, on=pair_keys, how='left')
# If no feedback at all was left unedited, the merge leaves NaN here; treat that as 0.
df_feedback_edits['no_edit'] = df_feedback_edits['no_edit'].fillna(0)

# Share of feedbacks followed by at least one edit; 0 when the (year, grade) pair has no feedback.
df_feedback_edits['ratio'] = df_feedback_edits['edit'].div(df_feedback_edits['no_edit']+df_feedback_edits['edit']).fillna(0)
In [151]:
fig = df_feedback_edits.plot(x="supervisor_grade", y=["edit"],
                             line_group='activity_school_year', color='activity_school_year')
fig.update_layout(
    title="# activities that have been edit after a feedback per grade",
    title_x=0.5,
    yaxis_title="# activities",
    xaxis_title="supervisor grade",
)
fig.update_traces(line_traces)

fig.show()

Logins

In [182]:
df_supervisors_logins = pd.read_csv(f"{data_folder}supervisors_logins.csv")\
                        .drop(['ut_user_type','start_semester','start_year'],axis=1)
# pd.to_datetime instead of astype('datetime64'): the unit-less dtype string is rejected by pandas 2.x.
df_supervisors_logins['date'] = pd.to_datetime(df_supervisors_logins['date'])

# Count logins per (day of week, hour of day), using pandas' numbering for the day of week.
supervisor_login_counts = df_supervisors_logins.groupby(
    [df_supervisors_logins["date"].dt.dayofweek.rename("dayofweek"),
     df_supervisors_logins["date"].dt.hour.rename("hour")]
).size()

# Expand to the complete 7 x 24 grid so that slots without any login appear with a count of 0.
# Reindexing does not depend on an exception being swallowed for missing (day, hour) pairs.
full_week_grid = pd.MultiIndex.from_product([range(7), range(24)], names=["dayofweek", "hour"])
date_hist_supervisors_logins = (
    supervisor_login_counts.reindex(full_week_grid, fill_value=0).rename("count").reset_index()
)

# Replace the numeric day of week with its label.
date_hist_supervisors_logins = date_hist_supervisors_logins.replace({'dayofweek':dayofweek_map})
In [183]:
fig = date_hist_supervisors_logins.plot(x="hour", y=["count"],facet_col='dayofweek',color='dayofweek')
fig.update_layout(
    title="Supervisors logins",
    title_x=0.5,
    yaxis_title="# logins",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1]))

fig.show()
In [184]:
fig = date_hist_supervisors_logins.plot(x="hour", y=["count"],line_group='dayofweek', color='dayofweek')
fig.update_layout(
    title="Supervisors logins",
    title_x=0.5,
    yaxis_title="# logins",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()
In [196]:
fig = date_hist_supervisors_logins.groupby("hour").sum().reset_index().plot(x="hour", y=["count"])
fig.update_layout(
    title="Cumulate supervisors logins",
    title_x=0.5,
    yaxis_title="# logins",
    legend_orientation="h",legend=dict(x=0.5, y=-0.1,xanchor='center',yanchor='top'),legend_title_text=''
)
fig.update_xaxes(title=dict(text=""),tickangle=45)
fig.update_traces(line_traces)

fig.show()
In [207]:
plt1 = date_hist_supervisors_logins.groupby("hour").sum().reset_index()
plt2 = date_hist_students_logins.groupby("hour").sum().reset_index()
# Create traces
fig = go.Figure()
fig.add_trace(go.Scatter(x=plt1['hour'], y=plt1['count'],
                    name='Supervisors'))
fig.add_trace(go.Scatter(x=plt2['hour'], y=plt2['count'],
                    name='Apprentices'))

fig.update_layout(
    title="Cumulate logins",
    title_x=0.5,
    yaxis_title="# Log10 logins",
    xaxis_title="hour of the day",
    yaxis_type="log",
)
fig.update_traces(line_traces)

fig.show()

Years considerations:

In [17]:
df_years = pd.read_csv(f"{data_folder}years_test.csv")
df_years.head()
Out[17]:
month activity_school_year start_year n_activities n_recipes n_experiences avg_n_user_activities avg_n_user_recipes avg_n_user_experiences
0 1 1 2013 176 150.0 26.0 5.03 4.29 0.74
1 1 1 2014 245 191.0 54.0 5.98 4.66 1.32
2 1 1 2015 170 69.0 101.0 5.15 2.09 3.06
3 1 1 2016 131 98.0 33.0 3.45 2.58 0.87
4 1 1 2017 160 109.0 51.0 4.32 2.95 1.38
In [84]:
df_only_years = df_years.groupby(["activity_school_year","start_year"]).sum().reset_index()
df_only_years['activity_school_year']=df_only_years['activity_school_year'].astype(str)

fig = df_only_years.plot.hist(x="start_year", y=['avg_n_user_activities'],color="activity_school_year")
fig.update_layout(
    xaxis_type='category',
    barmode='group',
    title="Total number of activities per month",
    title_x=0.5,
    yaxis_title="Number of activities",
)

fig.show()
In [ ]:
 
In [67]:
# Number of feedback rows per start year.
feedbacks_per_year = df_feedbacks_info.groupby('start_year').size().reset_index(name="count")

# Normalise by the number of users who started in the same year. The user counts are looked up
# by start_year itself: dividing the two `count` columns directly would pair rows by position
# and silently mix years whenever the two frames do not list exactly the same years.
users_by_start_year = users_per_year.set_index('start_year')['count']
feedbacks_per_year['norm_count'] = feedbacks_per_year['count'].div(
    feedbacks_per_year['start_year'].map(users_by_start_year)
)

fig = feedbacks_per_year.plot.scatter(x="start_year",y="norm_count", size="count", color="count")
fig.update_layout(
    title="Normalized feedbacks per year",
    title_x=0.5,
)
fig.update_traces(mode='lines+markers',line_shape='spline',line_smoothing=0.5)

fig.show()
In [ ]:
 
In [ ]:
 
In [ ]: